#!/usr/bin/perl -w


################################################################################
#
# Author: Tim Baldwin
#
# Synopsis: Evaluate the sentence tokenisation accuracy of the tokenisation script
#
# License: LGPL (see http://malay-toklem.googlecode.com/)
#
# Usage: see README
#
################################################################################





$HOME=".";



$SYSOUTDIR = "$HOME/tokeniser.out/";

$GOLDDIR = "$HOME/eval/corpora/corpus.tokenised/";



$SSEG = '^';








use Getopt::Long;

# Get Command-Line options.
GetOptions("<>" => sub{$SYSOUTDIR = $_[0]},
	   "v" => \$VERBOSE,
           );





while ($GOLD = <$GOLDDIR/*.txt>)
{
    print STDERR "$GOLD\n";
    $GOLD =~ /.*\/(.+)$/;
    $SYSOUT = $SYSOUTDIR . $1;
    die "ERROR: Couldn't locate $SYSOUT\n" unless -f $SYSOUT;

    &file_eval($GOLD,$SYSOUT);
}



$prec = ($tp or $fp) ? $tp / ($tp+$fp) : 0;
$rec = ($tp or $fn) ? $tp / ($tp+$fn) : 0;
$fscore = ($prec or $rec) ? 2*$prec*$rec / ($prec+$rec) : 0;


printf "P = %.3f, R = %.3f, F = %.3f\n", $prec, $rec, $fscore;






sub file_eval
{
    ($GOLD,$SYSOUT) = @_;

    open GOLD or die;
    open SYSOUT or die;

    $lineno = 1;
    while ($goldline = <GOLD>)
    {
	$goldline =~ s/  +/ /g;
	$sysoutline = <SYSOUT>;
	$sysoutline =~ s/  +/ /g;
	die "ERROR: Unexpected EOF at $lineno of $SYSOUT\n" unless $sysoutline;

#	print "$goldline <-> $sysoutline\n";

	my @goldwords = split / /, $goldline;
	my @sysoutwords = split / /, $sysoutline;
	my $gtoken = shift @goldwords;
	my $otoken = shift @sysoutwords;
#	print STDERR "ERROR (line $lineno): mismatch in token count $#goldwords <-> $#sysoutwords\n";
	while ($#goldwords >= 0)
	{
	    if ($gtoken eq $SSEG)
	    {
		if ($otoken eq $SSEG)
		{
		    $tp++;
		    $gtoken = shift @goldwords;
		    $otoken = shift @sysoutwords;
		}
		else
		{
		    $fn++;
		    $gtoken = shift @goldwords;
		    print "$GOLD ($lineno)\nGOLD  : ${goldline}SYSOUT: $sysoutline\n" if $VERBOSE;
		}
	    }
	    else
	    {
		if ($otoken eq $SSEG)
		{
		    $fp++;
		    $otoken = shift @sysoutwords;
		}
		else
		{
		    $gtoken = shift @goldwords;
		    $otoken = shift @sysoutwords;
		}
		
	    }
	}
	$lineno++;
    }

    close GOLD;
    close SYSOUT;
}

